Manual Processing of CSV Files in Python¶
By Cesar Perez
Introduction.¶
The goal of this exercise is to write code capable of:
- Open csv/txt files.
- Summarize data by a set of given parameters.
- Output the summary as a dictionary.
The exercise is focused on string manipulation and data structures, so I won't use common data manipulation libraries such as Pandas or the built-in csv module. In situations where memory is limited but large datasets need to be analyzed, the proposed function can serve as a workaround.
For demonstration, I'm using the open dataset supermarket-sales.csv, which is available for download on Kaggle.
Defining Functions.¶
To achieve the goal defined earlier, I defined multiple functions. Here is an overview:
- csv2summary. This is the main function; it opens the csv file, iterates over its rows, and produces the final summary using the rest of the functions below.
- transform_dict. Transforms a dictionary that stores the relevant csv values into a nested structure.
- dict_addition. In charge of the cumulative operations.
- filter_values. In charge of deciding whether a row should be ignored based on the user's given parameters.
- sum_values. Performs a sum of values.
- mean_values. Returns a tuple holding the sum, count, and mean of values.
- adjust_line. Detects delimiter characters sitting between quotation characters, which could break the csv integrity, and removes them.
transform_dict.¶
If we want to summarize a csv file, we should start by relating columns and values through dictionaries, where the key represents the column/field name and the value represents the content of a cell or a column. We can learn the column sequence by reading the first row of the csv or by introducing it as a list.
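For illustration (a small sketch, not part of the original notebook; the header and cell values are borrowed from the example further below), a single row can be related to its columns like this:
headers = ('Branch', 'Customer type', 'gross income', 'Rating')
cells = ('A', 'Member', '30.91', '6.6')
row = dict(zip(headers, cells))  # column name -> cell content
print(row)
# {'Branch': 'A', 'Customer type': 'Member', 'gross income': '30.91', 'Rating': '6.6'}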
Starting from this basic structure, we can summarize the data by creating a new dictionary where the category values we are interested in are converted into keys storing the final values. If we want to summarize by more than one field, we must nest dictionaries so that the combination of values A and C is kept independent of other combinations, for example B and C, and so on. The function below handles this transformation.
In the example below, I'm interested in summarizing my data by two categorical fields: Branch and Customer type. Branch has three possible values, 'A', 'B' and 'C', while Customer type can be 'Member' or 'Normal'. Gross income and Rating are the numerical fields I want to summarize; the input dictionary below represents a single row.
def transform_dict(in_dict, append, index_list, max_level, append_len, level=0):
    if level < max_level:
        # Index level: use the category value itself as a nested key.
        new_key = list(append.values())[level]
        in_dict[new_key] = {}
        level += 1
        transform_dict(in_dict[new_key], append, index_list, max_level, append_len, level)
    elif level < append_len:
        # Value level: store the field name as key and its content as a float.
        new_key = list(append.keys())[level]
        new_value = list(append.values())[level]
        in_dict[new_key] = float(new_value)
        level += 1
        transform_dict(in_dict, append, index_list, max_level, append_len, level)
    return in_dict
print(transform_dict({},
                     {'Branch': 'A', 'Customer type': 'Member', 'gross income': '30.91', 'Rating': '6.6'},
                     ['Branch', 'Customer type'], 2, 4))
{'A': {'Member': {'gross income': 30.91, 'Rating': 6.6}}}
The output of the function stores the gross income of 30.91 and the rating of 6.6 as relative to the category combination A - Member.
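As a further illustration (a sketch added here, not in the original notebook), a single index field works with max_level = 1 and append_len = 3 (one index plus two values):
print(transform_dict({}, {'City': 'Yangon', 'gross income': '30.91', 'Rating': '6.6'},
                     ['City'], 1, 3))
# Expected: {'Yangon': {'gross income': 30.91, 'Rating': 6.6}}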
sum_values - mean_values.¶
These two functions calculate the sum and the average respectively. They are called through a dictionary mapping inside the dict_addition function; depending on the user's parameters, one of the two is dynamically selected.
def sum_values(existing, addition):
    return float(existing + addition)

def mean_values(existing, addition):
    if isinstance(existing, tuple):
        # A tuple means we already track (sum, count, mean) from previous rows.
        sum_val = existing[0] + addition
        count_val = existing[1] + 1
    else:
        # First accumulation: 'existing' is still a bare value.
        sum_val = existing + addition
        count_val = 2
    mean_val = sum_val / count_val
    return tuple([sum_val, count_val, mean_val])
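A quick sketch (not part of the original notebook) of how mean_values accumulates across repeated calls: the first call receives two bare values, later calls receive the running (sum, count, mean) tuple.
running = mean_values(6.0, 8.0)       # first two values -> (14.0, 2, 7.0)
running = mean_values(running, 10.0)  # running tuple plus a value -> (24.0, 3, 8.0)
print(running)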
dict_addition.¶
Going back to the output of the transform_dict function above, the next step to get our summarized data is to store the accumulation of values in a final dictionary that represents the whole table. The function below handles that task.
def dict_addition(in_dict, append, values_dict, append_len, level=0):
    select_operation = {'sum': sum_values, 'mean': mean_values}
    level_key = list(append.keys())[0]
    value_col = list(values_dict.keys())
    # 'in' is used rather than .get(..., False) != False so that a stored
    # value of 0 is not mistaken for a missing key.
    if level_key in in_dict and level_key not in value_col:
        # Key exists and is an index level: inspect the next level.
        level += 1
        dict_addition(in_dict[level_key], append[level_key], values_dict, append_len, level=level)
    elif level_key in in_dict and level_key in value_col and level < append_len:
        # Key exists and is a value field: accumulate.
        for indx, key in enumerate(in_dict):
            in_dict[key] = select_operation[values_dict[key]](in_dict[key], append[list(in_dict.keys())[indx]])
    else:
        # Key does not exist yet: append the whole sub-dictionary.
        in_dict[level_key] = append[level_key]
    return
The example below shows four dictionaries having the same structure as the output of the transform_dict function; out_file represents our final summary. We expect the two entries having A - Member (items 1 and 3) to be summarized as gross income = 70, while A - Normal and B - Normal, being single entries, should have values of 20 and 10 respectively. Since A - Member and A - Normal belong to branch A, these two should be part of the branch = 'A' dictionary, while branch = 'B' should only hold the customer type = 'Normal' value.
Note that the dictionary {'gross income': 'sum'} picks the sum_values function described earlier.
out_file = {}
for row in [{'A': {'Member': {'gross income': 30}}},
            {'B': {'Normal': {'gross income': 10}}},
            {'A': {'Member': {'gross income': 40}}},
            {'A': {'Normal': {'gross income': 20}}}]:
    dict_addition(out_file, row, {'gross income': 'sum'}, 4)
print(out_file)
{'A': {'Member': {'gross income': 70.0}, 'Normal': {'gross income': 20}}, 'B': {'Normal': {'gross income': 10}}}
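As an extra sketch (not in the original example), the same function can mix operations, summing gross income while averaging Rating; note that Rating ends up as the (sum, count, mean) tuple produced by mean_values:
out_file = {}
for row in [{'A': {'gross income': 30.0, 'Rating': 6.0}},
            {'A': {'gross income': 40.0, 'Rating': 8.0}}]:
    dict_addition(out_file, row, {'gross income': 'sum', 'Rating': 'mean'}, 3)
print(out_file)
# Expected: {'A': {'gross income': 70.0, 'Rating': (14.0, 2, 7.0)}}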
filter_values.¶
This function analyses each row and decides whether it should be excluded based on criteria provided by the user. The values in filter_dict are expected to be lists, one list of excluded values per column, so that multi-character values are compared whole rather than character by character.
def filter_values(split_line, filter_dict, col_mapping):
    # Returns True when the row matches any excluded value, i.e. it should be skipped.
    for col in filter_dict:
        for val in filter_dict[col]:
            if split_line[col_mapping[col]] == val:
                return True
    return False
In the example below, we decide to exclude branch = 'A' (passed as a list), which matches the input row (second value in the first input parameter). We expect the function to return True, meaning that the row should be excluded.
print(filter_values(['849-09-3807', 'A', 'Yangon', 'Member', 'Female', 'Fashion accessories', '88.34', '7', '30.919', '649.299', '2/18/2019', '13:28', 'Cash', '618.38', '4.761904762', '30.919', '6.6'],
                    {'Branch': ['A']},
                    {'Invoice ID': 0, 'Branch': 1, 'City': 2, 'Customer type': 3, 'Gender': 4, 'Product line': 5, 'Unit price': 6, 'Quantity': 7, 'Tax 5%': 8, 'Total': 9, 'Date': 10, 'Time': 11, 'Payment': 12, 'cogs': 13, 'gross margin percentage': 14, 'gross income': 15, 'Rating': 16}))
True
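A small additional sketch (not from the original notebook): several excluded values can be given per column, and any match leads to exclusion.
row = ['849-09-3807', 'B', 'Yangon', 'Member']
mapping = {'Invoice ID': 0, 'Branch': 1, 'City': 2, 'Customer type': 3}
print(filter_values(row, {'Branch': ['A', 'B']}, mapping))  # True: branch 'B' is excluded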
adjust_line.¶
This auxiliary function identifies and fixes rows where the delimiter character (for example, a comma ',') appears between a pair of quoting characters and could therefore break the column structure of the csv. The function is necessary given that we are not importing any library that would handle this for us.
def adjust_line(line, quoting_char='"', sep=','):
    max_indx = len(line)
    fixed_line = ''
    # Collect the positions of every quoting character.
    indx_list = []
    for indx, character in enumerate(line):
        if character == quoting_char:
            indx_list.append(indx)
    if len(indx_list) == 0:
        return line
    # Pair consecutive quote positions into (open, close) spans.
    paired_list = []
    for pair in range(0, len(indx_list), 2):
        paired_list.append(tuple([indx_list[pair], indx_list[pair + 1]]))
    # Split the line into alternating unquoted/quoted sections.
    line_sections = []
    line_pos = 0
    for indx, section in enumerate(paired_list):
        line_sections.append(tuple([line_pos, section[0] - 1]))
        line_sections.append(section)
        line_pos = section[1] + 1
        if indx == len(paired_list) - 1:
            line_sections.append(tuple([line_pos, max_indx]))
    # Remove the separator only inside quoted sections.
    for section in line_sections:
        if section in paired_list:
            fixed_line = fixed_line + line[section[0]:section[1] + 1].replace(sep, '')
        else:
            fixed_line = fixed_line + line[section[0]:section[1] + 1]
    return fixed_line
The code below shows two examples of csv rows represented as strings. The first one poses no issue, since none of its values contains a comma and that character only delimits the values; the second row has the problem in its last column. adjust_line removes the conflicting character.
print(adjust_line('849-09-3807, A, Yangon, Member, Female, Fashion accessories'))
print(adjust_line('849-09-3807, A, Yangon, Member, Female,"Food, kitchen and Cooking"'))
849-09-3807, A, Yangon, Member, Female, Fashion accessories
849-09-3807, A, Yangon, Member, Female,"Food kitchen and Cooking"
The cell below shows how the string is parsed differently depending on the use of adjust_line.
print(adjust_line('849-09-3807, A, Yangon, Member, Female,"Food, kitchen and Cooking"').split(','))
print('849-09-3807, A, Yangon, Member, Female,"Food, kitchen and Cooking"'.split(','))
['849-09-3807', ' A', ' Yangon', ' Member', ' Female', '"Food kitchen and Cooking"']
['849-09-3807', ' A', ' Yangon', ' Member', ' Female', '"Food', ' kitchen and Cooking"']
csv2summary.¶
This function wraps up everything presented earlier. After opening the target file, it defines the headers (which may or may not be the first row of the file), checks whether each line needs to be fixed with adjust_line, and decides whether the row should be excluded with filter_values. If the line is relevant to the summary, it gathers the relevant values with transform_dict, accumulates the results with dict_addition, and finally returns the output.
def csv2summary(input_file, sep=',', quoting_char='"', index_list=[], values_dict={},
                filter_dict={}, known_headers=[], if_headers=True, if_adjust=False,
                encode='utf-8'):
    out_file = {}
    max_level = len(index_list)
    append_len = max_level + len(values_dict)
    with open(input_file, 'r', encoding=encode) as in_file:
        # Headers either come from the first row or are supplied by the user.
        if if_headers:
            headers = tuple(in_file.readline().strip().split(sep))
        else:
            headers = tuple(known_headers)
        col_mapping = {col: indx for indx, col in enumerate(headers)}
        for line in in_file:
            line_keys = {}
            if if_adjust:
                split_line = adjust_line(line.strip(), quoting_char, sep).split(sep)
            else:
                split_line = line.strip().split(sep)
            # Skip rows matching the user's exclusion criteria.
            if filter_values(split_line, filter_dict, col_mapping):
                continue
            # Collect the index values first, then the values to accumulate.
            for indx_level in index_list:
                line_keys[indx_level] = split_line[col_mapping[indx_level]]
            for value in values_dict:
                line_keys[value] = split_line[col_mapping[value]]
            adjusted_keys = transform_dict({}, line_keys, index_list, max_level, append_len)
            dict_addition(out_file, adjusted_keys, values_dict, append_len)
    return out_file
The following examples show the output of the function using different levels of analysis.
csv2summary('supermarket_sales.csv', index_list = ['City'], values_dict = {'gross income':'sum'})
{'Yangon': {'gross income': 5057.160500000002}, 'Naypyitaw': {'gross income': 5265.176500000002}, 'Mandalay': {'gross income': 5057.032000000003}}
csv2summary('supermarket_sales.csv', index_list = ['City', 'Gender'], values_dict = {'gross income':'sum'})
{'Yangon': {'Female': {'gross income': 2536.6269999999995}, 'Male': {'gross income': 2520.5335}}, 'Naypyitaw': {'Female': {'gross income': 2937.403000000002}, 'Male': {'gross income': 2327.7735000000007}}, 'Mandalay': {'Female': {'gross income': 2520.395000000001}, 'Male': {'gross income': 2536.637}}}
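As a further sketch (an illustrative call added here; its output is not reproduced), the remaining parameters can be combined, for example averaging the Rating while excluding branch 'A' and repairing quoted delimiters:
csv2summary('supermarket_sales.csv',
            index_list=['City'],
            values_dict={'Rating': 'mean'},
            filter_dict={'Branch': ['A']},
            if_adjust=True)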
Testing¶
This part is divided into three:
- Convert results to pandas. We'll declare auxiliary functions to convert our output to a format pandas can read; the purpose is to make the comparison easier.
- Accuracy test. This test compares our summary results against a grouped pandas dataframe and confirms the amounts agree.
- Performance vs large files. This test compares processing time and memory usage against a pandas dataframe.
import pandas as pd
Convert results to pandas¶
The following functions transform the csv2summary output into a MultiIndex-style dictionary format for pandas.
def inspect_item(csvsummary, output, target, subsummary, level=0, key_index=[]):
    # Walks the nested summary while tracking the current chain of index keys.
    if level == 0:
        key_index.append(subsummary)
    for item in csvsummary:
        if isinstance(csvsummary[item], dict):
            level += 1
            if len(key_index) == level:
                key_index.append(item)
            else:
                key_index[level] = item
            inspect_item(csvsummary[item], output, target, item, level, key_index)
            level -= 1
        elif item == target:
            # Leaf reached: record the value under its full key tuple.
            value = csvsummary[item]
            output[tuple(key_index)] = value
    key_index.pop()
def csvsummary2pandas(csvsummary, target_list):
    output = {}
    for target in target_list:
        output[target] = {}
        for subsummary in csvsummary:
            inspect_item(csvsummary[subsummary], output[target], target, subsummary, level=0, key_index=[])
    return output
csvsummary2pandas(csv2summary('supermarket_sales.csv', index_list = ['City', 'Gender'], values_dict = {'gross income':'sum'}), ['gross income'])
{'gross income': {('Yangon', 'Female'): 2536.6269999999995, ('Yangon', 'Male'): 2520.5335, ('Naypyitaw', 'Female'): 2937.403000000002, ('Naypyitaw', 'Male'): 2327.7735000000007, ('Mandalay', 'Female'): 2520.395000000001, ('Mandalay', 'Male'): 2536.637}}
Accuracy Test¶
The following three examples consist of:
- Using csv2summary, each example with a different configuration; the output is converted so pandas can read it.
- Using pandas.read_csv and then .groupby to get an equivalent result.
- Confirming the results are the same by merging both dataframes and checking them side by side.
We are able to confirm the expected result: both methods lead to the same outcome.
test = pd.DataFrame.from_dict(csvsummary2pandas(csv2summary('supermarket_sales.csv', index_list = ['City', 'Gender'], values_dict = {'gross income':'sum'}), ['gross income'])).sort_index()
test
|           |        | gross income |
|-----------|--------|--------------|
| Mandalay  | Female | 2520.3950    |
| Mandalay  | Male   | 2536.6370    |
| Naypyitaw | Female | 2937.4030    |
| Naypyitaw | Male   | 2327.7735    |
| Yangon    | Female | 2536.6270    |
| Yangon    | Male   | 2520.5335    |
pandas_df = pd.read_csv('supermarket_sales.csv').groupby(['City', 'Gender']).agg({'gross income': 'sum'}).sort_index()
pandas_df
| City      | Gender | gross income |
|-----------|--------|--------------|
| Mandalay  | Female | 2520.3950    |
| Mandalay  | Male   | 2536.6370    |
| Naypyitaw | Female | 2937.4030    |
| Naypyitaw | Male   | 2327.7735    |
| Yangon    | Female | 2536.6270    |
| Yangon    | Male   | 2520.5335    |
pandas_df.merge(test, right_index=True, left_on=['City', 'Gender'])
| City      | Gender | gross income_x | gross income_y |
|-----------|--------|----------------|----------------|
| Mandalay  | Female | 2520.3950      | 2520.3950      |
| Mandalay  | Male   | 2536.6370      | 2536.6370      |
| Naypyitaw | Female | 2937.4030      | 2937.4030      |
| Naypyitaw | Male   | 2327.7735      | 2327.7735      |
| Yangon    | Female | 2536.6270      | 2536.6270      |
| Yangon    | Male   | 2520.5335      | 2520.5335      |
test = pd.DataFrame.from_dict(csvsummary2pandas(csv2summary('supermarket_sales.csv', index_list=['Branch', 'Customer type'], values_dict={'gross income': 'sum'}), ['gross income'])).sort_index()
pandas_df = pd.read_csv('supermarket_sales.csv').groupby(['Branch', 'Customer type']).agg({'gross income': 'sum'})
pandas_df.merge(test, right_index=True, left_on=['Branch', 'Customer type'])
| Branch | Customer type | gross income_x | gross income_y |
|--------|---------------|----------------|----------------|
| A      | Member        | 2554.1655      | 2554.1655      |
| A      | Normal        | 2502.9950      | 2502.9950      |
| B      | Member        | 2557.3660      | 2557.3660      |
| B      | Normal        | 2499.6660      | 2499.6660      |
| C      | Member        | 2708.6325      | 2708.6325      |
| C      | Normal        | 2556.5440      | 2556.5440      |
test = pd.DataFrame.from_dict(csvsummary2pandas(csv2summary('supermarket_sales.csv', index_list=['Branch', 'Gender', 'Product line'], values_dict={'gross income': 'sum'}), ['gross income'])).sort_index()
pandas_df = pd.read_csv('supermarket_sales.csv').groupby(['Branch', 'Gender', 'Product line']).agg({'gross income': 'sum'})
pandas_df.merge(test, right_index=True, left_on=['Branch', 'Gender', 'Product line'])
| Branch | Gender | Product line           | gross income_x | gross income_y |
|--------|--------|------------------------|----------------|----------------|
| A      | Female | Electronic accessories | 474.5855       | 474.5855       |
| A      | Female | Fashion accessories    | 468.3915       | 468.3915       |
| A      | Female | Food and beverages     | 333.3220       | 333.3220       |
| A      | Female | Health and beauty      | 272.1380       | 272.1380       |
| A      | Female | Home and lifestyle     | 601.7530       | 601.7530       |
| A      | Female | Sports and travel      | 386.4370       | 386.4370       |
| A      | Male   | Electronic accessories | 397.6580       | 397.6580       |
| A      | Male   | Fashion accessories    | 309.3470       | 309.3470       |
| A      | Male   | Food and beverages     | 483.9685       | 483.9685       |
| A      | Male   | Health and beauty      | 327.7550       | 327.7550       |
| A      | Male   | Home and lifestyle     | 465.7325       | 465.7325       |
| A      | Male   | Sports and travel      | 536.0725       | 536.0725       |
| B      | Female | Electronic accessories | 388.8815       | 388.8815       |
| B      | Female | Fashion accessories    | 432.4520       | 432.4520       |
| B      | Female | Food and beverages     | 500.4760       | 500.4760       |
| B      | Female | Health and beauty      | 304.7785       | 304.7785       |
| B      | Female | Home and lifestyle     | 455.1015       | 455.1015       |
| B      | Female | Sports and travel      | 438.7055       | 438.7055       |
| B      | Male   | Electronic accessories | 423.0920       | 423.0920       |
| B      | Male   | Fashion accessories    | 349.1345       | 349.1345       |
| B      | Male   | Food and beverages     | 224.0425       | 224.0425       |
| B      | Male   | Health and beauty      | 646.6815       | 646.6815       |
| B      | Male   | Home and lifestyle     | 380.5730       | 380.5730       |
| B      | Male   | Sports and travel      | 513.1135       | 513.1135       |
| C      | Female | Electronic accessories | 427.1055       | 427.1055       |
| C      | Female | Fashion accessories    | 548.5565       | 548.5565       |
| C      | Female | Food and beverages     | 745.7695       | 745.7695       |
| C      | Female | Health and beauty      | 306.9400       | 306.9400       |
| C      | Female | Home and lifestyle     | 373.4730       | 373.4730       |
| C      | Female | Sports and travel      | 535.5585       | 535.5585       |
| C      | Male   | Electronic accessories | 476.1790       | 476.1790       |
| C      | Male   | Fashion accessories    | 478.1135       | 478.1135       |
| C      | Male   | Food and beverages     | 385.9855       | 385.9855       |
| C      | Male   | Health and beauty      | 484.2660       | 484.2660       |
| C      | Male   | Home and lifestyle     | 288.2200       | 288.2200       |
| C      | Male   | Sports and travel      | 215.0095       | 215.0095       |
Performance vs large files.¶
For this test I used a modified version of the csv file I've been using for demonstration: the data has been duplicated multiple times to reach 12,000,000 records, for a file size of 1.46 GB. The original file has only 1,000 records and a size of 128 KB.
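A minimal sketch of how such a file could be produced (an assumption on my part, not necessarily the exact script used; only the record counts above are taken from the text):
# Duplicate the original 1,000 data rows 12,000 times -> 12,000,000 records.
with open('supermarket_sales.csv', 'r', encoding='utf-8') as src:
    header = src.readline()
    rows = src.readlines()
with open('supermarket_sales_mod.csv', 'w', encoding='utf-8') as dst:
    dst.write(header)
    for _ in range(12000):
        dst.writelines(rows)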
The following cells show that, at the expense of CPU time, csv2summary causes a minimal increment in memory usage compared to a direct use of pandas (low_memory=False), which is the expected result.
import setuptools
%load_ext memory_profiler
%%time
%memit csv2summary('supermarket_sales_mod.csv', index_list = ['City','Product line'], values_dict = {'gross income':'sum'})
peak memory: 110.80 MiB, increment: 0.26 MiB
CPU times: total: 1min 2s
Wall time: 1min 3s
%%time
%memit pd.read_csv('supermarket_sales_mod.csv',low_memory=False).groupby(['City','Product line']).agg({'gross income':'sum'})
peak memory: 6438.32 MiB, increment: 6327.55 MiB
CPU times: total: 33.3 s
Wall time: 36.3 s
The following cell modifies the memory usage behavior in pandas by setting low_memory to True.
%%time
%memit pd.read_csv('supermarket_sales_mod.csv', low_memory=True).groupby(['City','Product line']).agg({'gross income':'sum'})
peak memory: 5549.17 MiB, increment: 5438.44 MiB
CPU times: total: 23.5 s
Wall time: 26.7 s
Conclusion.¶
Even with multiple limitations, the proposed approach is a viable way to summarize data when memory is limited, at the cost of greater processing time.